import os

import cv2
import numpy as np
from paddleocr import PaddleOCR, draw_ocr
from pdf2image import convert_from_path
from PIL import Image

# Input PDF and the directory that receives the rendered page images.
pdf_path = 'E:/项目/zz/EP新项目/每家工厂的磅单和结算单/东台常嘉--银河白纸/过磅单.pdf'
output_dir = 'output_images/'

# Saving an image into a directory that does not exist fails, so create the
# output directory up front (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# Render every PDF page to an image at 200 DPI, backed by JPEG files in
# output_dir.
pages = convert_from_path(pdf_path, dpi=200, output_folder=output_dir, fmt='jpg')

# Initialise PaddleOCR with the Chinese model and the text-angle classifier.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")

# Run OCR on each rendered page and save an annotated visualisation of it.
for idx, page in enumerate(pages):
    # Persist the page under a predictable name so OCR can read it from disk.
    image_path = f'{output_dir}page_{idx + 1}.jpg'
    page.save(image_path, 'JPEG')

    result = ocr.ocr(image_path, cls=True)

    # ocr.ocr() returns one entry per input image, so the detections for this
    # single page live in result[0]. That entry is None (or empty) when no
    # text was found; such pages have nothing to draw and are skipped.
    detections = result[0] if result else None
    if not detections:
        continue

    # Each detection is [box, (text, score)].
    boxes = [line[0] for line in detections]
    txts = [line[1][0] for line in detections]
    scores = [line[1][1] for line in detections]

    # Draw boxes and recognised text, then save the visualisation.
    im_show = draw_ocr(np.array(page), boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save(f'result_page_{idx + 1}.jpg')